# Explanation of data files and variables ----

# Data files:

# "ThreeEmotionsCH_sentiment scores.csv": sentiment scores for each lyric calculated by the ChatGPT-assisted approach.
# "ThreeEmotionsCH_Year2_IntensityWithSign_avg.csv": average lyric sentiment score for each year using the ChatGPT-assisted approach.
# "Lyric_split_sentiment scores.csv": sentiment scores for each lyric calculated by the conventional, bottom-up approach.
# "Lyrics_split_Year2_IntensityWithSign.csv": average lyric sentiment score for each year using the conventional, bottom-up approach.
# "Taiwan misery.xlsx": Historical data of Taiwan's economic development, including GDP growth rate, unemployment rate, and CPI (inflation rate); and their standardized versions.
# "manual check.xlsx": evaluation of the predictive performance of the ChatGPT-assisted approach and the lexicon-only approach on a random sample of 200 song lyrics.


# Variables:

# Year2: the year when each song is released.
# count: number of songs released that year.
# IntensityWithSign: the sentiment score for each lyric.
# IntensityWithSign_avg: the average lyric sentiment for each year.
# id: id number for each song.
# threeEmotionsCH: the three Chinese emotion words generated by ChatGPT for each lyric.


library(cosinor)
library(cosinor2)
library(tidyverse)
library(dynlm)
library(lmtest)
library(sandwich)
library(zoo)
library(forcats)
library(readxl)


cbPalette <- c(
  "#000000",  # black
  "#E69F00",  # orange
  "#56B4E9",  # sky blue
  "#009E73",  # bluish green
  "#F0E442",  # yellow
  "#0072B2",  # blue
  "#D55E00",  # vermillion
  "#CC79A7"   # reddish purple
)

setwd("") # Set your working directory

df <- read_csv("ThreeEmotionsCH_Year2_IntensityWithSign_avg.csv")
df$region_coarse[is.na(df$region_coarse)] <- 'unknown'
df$region[is.na(df$region)] <- 'unknown'

dF <- read_csv('ThreeEmotionsCH_sentiment scores.csv')
dF <- dF %>% filter(Year2 >= 1967) %>%
  filter(IntensityWithSign <= 15, IntensityWithSign >= -15)
dF$region_coarse[is.na(dF$region_coarse)] <- 'unknown'

Df <- read_csv("Lyrics_split_Year2_IntensityWithSign.csv")
Df$region_coarse[is.na(Df$region_coarse)] <- 'unknown'
Df$region[is.na(Df$region)] <- 'unknown'

DF <- read_csv('Lyric_split_sentiment scores.csv')
DF$region_coarse[is.na(DF$region_coarse)] <- 'unknown'
DF <- DF %>% filter(Year2 >= 1967)

dF <- dF %>% semi_join(DF, by = c("id" = "id"))
DF <- DF %>% semi_join(dF, by = c("id" = "id")) 


# 1. Creating smooth line and scatter plot ----

## 1.2 Three GPT words----
df1 <- df %>% group_by(Year2) %>%
  summarise(IntensityWithSign_avg = weighted.mean(IntensityWithSign_avg, count), count = sum(count)) %>%
  filter(Year2 >= 1967) %>%
  select(Year2, IntensityWithSign_avg)

cbPalette <- c(
  "Avg. sentiment of songs"       = "#000000",
  "LOESS fitted (span = .5)"           = "#56B4E9"
)

df1 %>% filter(Year2 >= 1967) %>%
  ggplot(aes(x = Year2, y = IntensityWithSign_avg)) +
  geom_point(aes(color = "Avg. sentiment of songs")) +
  geom_smooth(aes(color = "LOESS fitted (span = .5)"), span = 0.5) +
  scale_color_manual(values = cbPalette) +
  labs(x = "Year", y = "Avg. sentiment")+
  theme(legend.position = "bottom")+
  guides(color = guide_legend(title = NULL))+
  scale_x_continuous(breaks = seq(1960, 2020, by = 10))

## 1.2 Lyric split----

Df1 <- Df %>% group_by(Year2) %>%
  summarise(IntensityWithSign_avg = weighted.mean(IntensityWithSign_avg, count), count = sum(count)) 

p <- Df1 %>% filter(Year2 >= 1967) %>%
  ggplot(aes(x = Year2)) +
  geom_point(aes(y = IntensityWithSign_avg, color = "Avg. sentiment of songs")) +
  geom_smooth(aes(y = IntensityWithSign_avg, color = "LOESS fitted (span = .5)"), size = 1.25, span = 0.5) +
  scale_color_manual(values = cbPalette) +
  labs(x = "Year", y = "Avg. sentiment")+
  theme(legend.position = "bottom")+
  guides(color = guide_legend(title = NULL))+
  scale_x_continuous(breaks = seq(1960, 2020, by = 10))

p

ggsave("Fig 4.tiff", plot = p, width = 8, height = 6, dpi = 300, units = "in", compression = "lzw")

# 2. Overall sentiment distribution ----

dF1 <- dF %>% filter(IntensityWithSign <= 15, IntensityWithSign >= -15)

p <- dF1 %>% filter(!is.na(IntensityWithSign)) %>%
  ggplot(aes(x=IntensityWithSign)) + 
  geom_density(adjust = 2, fill = "#56B4E9", color = NA, alpha = 1) +
  scale_x_continuous(breaks = seq(-15, 15, by = 5), limits = c(-15, 15)) +
  guides(color = guide_legend(title = NULL), fill = guide_legend(title = NULL)) +
  labs(x = 'Sentiment score of lyrics', y = 'Probability density')

p

ggsave("Fig 3.tiff", plot = p, width = 8, height = 5, dpi = 300, units = "in", compression = "lzw")

# 3. Spectral analysis and Periodogram ----

Intensity_spec <- stats::spectrum(df1$IntensityWithSign_avg, log = "no", plot = F)
spx <- Intensity_spec$freq
spy <- Intensity_spec$spec*2

sp <- data.frame(spx, spy)

p <- sp %>% ggplot(aes(x=spx, y=spy)) +
  geom_line(size = 1) +
  labs(x = "Frequency (periods/year)", y = "Spectral density")

p

ggsave("Fig 5.tiff", plot = p, width = 8, height = 6, dpi = 300, units = "in", compression = "lzw")

df2 <- df1 %>% mutate(X = "X") %>%
  unite(Year2, X, Year2, sep = "") %>%
  spread(Year2, IntensityWithSign_avg)
Year2 <- 1:57

p <- periodogram(data = df2, time = Year2)

p

ggsave("Fig 6.tiff", plot = p, width = 12, height = 9, dpi = 300, units = "in", compression = "lzw")

# 4. Stimulating 3 scenarios of damping ----

# undamped

t <- 0:70

undamp <- function(t) {3*cos(2*pi/30*t)}
underdamp <- function(t) {3*exp(-0.02*t)*cos(2*pi/30*t)}
critdamp <- function(t) {3*(1 + 0.3*t)*exp(-0.3*t)}
overdamp <- function(t) {3*exp(-0.04*t)}

S_un <- undamp(t)
S_under <- underdamp(t)
S_crit <- critdamp(t)
S_over <- overdamp(t)

sim <- cbind(t, S_un, S_under, S_crit, S_over)

cbPalette <- c(
  "1: Undamped"       = "#000000",
  "2: Over-damped"      = "#CC79A7",
  "3: Critically damped" = "#D55E00",
  "4: Under-damped" = "#0072B2"
)

p <- sim %>% ggplot(aes(x = t)) +
  geom_line(aes(y = S_un, color = "1: Undamped"), linetype = 'dotted', size = 1.5) +
  geom_line(aes(y = S_under, color = "4: Under-damped"), size = 1.2) +
  geom_line(aes(y = S_crit, color = "3: Critically damped"), size = 1.2) +
  geom_line(aes(y = S_over, color = "2: Over-damped"), size = 1.2) +
  scale_color_manual(values = cbPalette) +
  guides(color = guide_legend(title = NULL)) +
  labs(x = 'Time (t)', y = 'Simulated avg. lyric sentiment') +
  theme(legend.position = "bottom")

p

ggsave("Fig 9.tiff", plot = p, width = 8, height = 6, dpi = 300, units = "in", compression = "lzw")

# 5. Baseline model----

mean(df1$IntensityWithSign_avg) # mean estimate
sd(df1$IntensityWithSign_avg)/sqrt(nrow(df1)) # standard error of the mean estimate
sqrt(sum((df1$IntensityWithSign_avg - mean(df1$IntensityWithSign_avg))^2)/(57-1)) # SER (standard error of regression)
sum((df1$IntensityWithSign_avg - mean(df1$IntensityWithSign_avg))^2) # SSE (Sum of squared errors)

# 6. Model 1: Fit simple Cosinor model and plotting ----

fit <- cosinor.lm(IntensityWithSign_avg ~ time(Year2), period = 34, data = df1)
summary(fit)
ggplot_cosinor.lm(fit)
correct.acrophase(fit)


fitCos1 <- function(t, D, fit) {fit$coefficients[[1]] + fit$coefficients[[2]]*cos(2*pi*t/D - fit$coefficients[[3]])}
T <- 1967:2023
fitted1 <- data.frame(IntensityHat_M1 = fitCos1(T, 34, fit))

df3 <- cbind(df1, fitted1)

# Standard Error of Regression (SER) = sqrt(SSE/degrees of freedom)

SER <- sqrt(sum((df3$IntensityWithSign_avg - df3$IntensityHat_M1)^2)/(57-3))
SSE <- sum((df3$IntensityWithSign_avg - df3$IntensityHat_M1)^2)

df3 %>% ggplot(aes(x = Year2)) +
  geom_point(aes(y = IntensityWithSign_avg, color = "Avg. sentiment")) +
  geom_smooth(aes(y = IntensityWithSign_avg, color = "LOESS"), span = 0.5) +
  geom_line(aes(y = IntensityHat_M1, color = "Model 1"), size = 1) +
  labs(x = "Year", y = "Avg. sentiment")+
  theme(legend.position = "bottom")+
  guides(color = guide_legend(title = NULL))+
  scale_x_continuous(breaks = seq(1960, 2020, by = 10))

# 7. Maximum Likelihood Estimation ----

t = 0:56
Y = df3$IntensityWithSign_avg

## 7.1 Model 2 ----

# theta = [sigma^2, A, Phi, T, Beta]

nlf <- function(theta) {
  0.5*57*log(2*pi*theta[1]) + 0.5*sum((Y - theta[2]*exp(-theta[5]*t)*cos(2*pi*t/theta[4] - theta[3]))^2)/theta[1]
}


theta <- c(0.7, 1, 0.9, 34, 0.016)

mle <- nlm(nlf, theta, hessian=TRUE)
# mle <- optim(theta, nlf, hessian = TRUE)

mle$estimate

fitModel2 <- function(t) {mle$estimate[2]*exp(-mle$estimate[5]*t)*cos(2*pi*t/mle$estimate[4]-mle$estimate[3])}

fittedM2 <- data.frame(IntensityHat_M2 = fitModel2(t))

df3 <- cbind(df3, fittedM2)

# Standard Error of Regression (SER) = sqrt(SSE/degrees of freedom)

SER <- sqrt(sum((df3$IntensityWithSign_avg - df3$IntensityHat_M2)^2)/(57-5))
SSE <- sum((df3$IntensityWithSign_avg - df3$IntensityHat_M2)^2)

var_cov_matrix <- solve(mle$hessian)
std_errors <- sqrt(diag(var_cov_matrix))
print(std_errors)

df3 %>% ggplot(aes(x = Year2)) +
  geom_point(aes(y = IntensityWithSign_avg, color = "Avg. sentiment")) +
  geom_smooth(aes(y = IntensityWithSign_avg, color = "LOESS"), span = 0.5) +
  geom_line(aes(y = IntensityHat_M1, color = "Model 1"), size = 1) +
  geom_line(aes(y = IntensityHat_M2, color = "Model 2"), size = 1) +
  labs(x = "Year", y = "Avg. sentiment")+
  theme(legend.position = "bottom")+
  guides(color = guide_legend(title = NULL))+
  scale_x_continuous(breaks = seq(1960, 2020, by = 10))


## 7.2 Model 3 ----

# theta = [sigma^2, A, Phi, T, Beta, B]

nlf <- function(theta) {
  0.5*57*log(2*pi*theta[1]) + 0.5*sum((Y - theta[2]*exp(-theta[5]*t)*cos(2*pi*t/theta[4] - theta[3])-theta[6])^2)/theta[1]
}


theta <- c(0.7, 1, 0.9, 34, 0.016, 0)

mle <- nlm(nlf, theta, hessian=TRUE)
# mle <- optim(theta, nlf, hessian = TRUE)

mle$estimate

fitModel3 <- function(t) {mle$estimate[2]*exp(-mle$estimate[5]*t)*cos(2*pi*t/mle$estimate[4]-mle$estimate[3]) + mle$estimate[6]}

fittedM3 <- data.frame(IntensityHat_M3 = fitModel3(t))

df3 <- cbind(df3, fittedM3)

# Standard Error of Regression (SER) = sqrt(SSE/degrees of freedom)

SER <- sqrt(sum((df3$IntensityWithSign_avg - df3$IntensityHat_M3)^2)/(57-6))
SSE <- sum((df3$IntensityWithSign_avg - df3$IntensityHat_M3)^2)

var_cov_matrix <- solve(mle$hessian)
std_errors <- sqrt(diag(var_cov_matrix))
print(std_errors)

df3 %>% ggplot(aes(x = Year2)) +
  geom_point(aes(y = IntensityWithSign_avg, color = "Avg. sentiment")) +
  geom_smooth(aes(y = IntensityWithSign_avg, color = "LOESS"), span = 0.5) +
  geom_line(aes(y = IntensityHat_M1, color = "Model 1"), size = 1) +
  geom_line(aes(y = IntensityHat_M2, color = "Model 2"), size = 1) +
  geom_line(aes(y = IntensityHat_M3, color = "Model 3"), size = 1) +
  labs(x = "Year", y = "Avg. sentiment")+
  theme(legend.position = "bottom")+
  guides(color = guide_legend(title = NULL))+
  scale_x_continuous(breaks = seq(1960, 2020, by = 10))

## 7.3 Model 4 ----

# theta = [sigma^2, A, Phi, T, Beta, B, C]

nlf <- function(theta) {
  0.5*57*log(2*pi*theta[1]) + 0.5*sum((Y - theta[2]*exp(-theta[5]*t)*cos(2*pi*t/theta[4] - theta[3])-theta[6] - theta[7]*t)^2)/theta[1]
}


theta <- c(0.7, 1, 0.9, 34, 0.016, 0, 0)

mle <- nlm(nlf, theta, hessian=TRUE)
# mle <- optim(theta, nlf, hessian = TRUE)

mle$estimate

fitModel4 <- function(t) {mle$estimate[2]*exp(-mle$estimate[5]*t)*cos(2*pi*t/mle$estimate[4]-mle$estimate[3]) + mle$estimate[6] + mle$estimate[7]*t}

fittedM4 <- data.frame(IntensityHat_M4 = fitModel4(t))

df3 <- cbind(df3, fittedM4)

# Standard Error of Regression (SER) = sqrt(SSE/degrees of freedom)

SER <- sqrt(sum((df3$IntensityWithSign_avg - df3$IntensityHat_M4)^2)/(57-7))
SSE <- sum((df3$IntensityWithSign_avg - df3$IntensityHat_M4)^2)

var_cov_matrix <- solve(mle$hessian)
std_errors <- sqrt(diag(var_cov_matrix))
print(std_errors)

df3 %>% ggplot(aes(x = Year2)) +
  geom_point(aes(y = IntensityWithSign_avg, color = "Avg. sentiment")) +
  geom_smooth(aes(y = IntensityWithSign_avg, color = "LOESS"), span = 0.5) +
  geom_line(aes(y = IntensityHat_M1, color = "Model 1"), size = 1) +
  geom_line(aes(y = IntensityHat_M2, color = "Model 2"), size = 1) +
  geom_line(aes(y = IntensityHat_M3, color = "Model 3"), size = 1) +
  geom_line(aes(y = IntensityHat_M4, color = "Model 4"), size = 1) +
  labs(x = "Year", y = "Avg. sentiment")+
  theme(legend.position = "bottom")+
  guides(color = guide_legend(title = NULL))+
  scale_x_continuous(breaks = seq(1960, 2020, by = 10))

## 7.4 Model 5 ----

# theta = [sigma^2, A, Phi, T, Beta, B, D]

nlf <- function(theta) {
  0.5*57*log(2*pi*theta[1]) + 0.5*sum((Y - theta[2]*exp(-theta[5]*t)*cos(2*pi*t/(theta[4]+theta[7]*t) - theta[3])-theta[6])^2)/theta[1]
}


theta <- c(0.7, 1, 0.9, 34, 0.016, 0, 0)

mle <- nlm(nlf, theta, hessian=TRUE)
# mle <- optim(theta, nlf, hessian = TRUE)

mle$estimate

fitModel5 <- function(t) {mle$estimate[2]*exp(-mle$estimate[5]*t)*cos(2*pi*t/(mle$estimate[4]+mle$estimate[7]*t)-mle$estimate[3]) + mle$estimate[6]}

fittedM5 <- data.frame(IntensityHat_M5 = fitModel5(t))

df3 <- cbind(df3, fittedM5)

# Standard Error of Regression (SER) = sqrt(SSE/degrees of freedom)

SER <- sqrt(sum((df3$IntensityWithSign_avg - df3$IntensityHat_M5)^2)/(57-7))
SSE <- sum((df3$IntensityWithSign_avg - df3$IntensityHat_M5)^2)

var_cov_matrix <- solve(mle$hessian)
std_errors <- sqrt(diag(var_cov_matrix))
print(std_errors)


df3 %>% ggplot(aes(x = Year2)) +
  geom_point(aes(y = IntensityWithSign_avg, color = "Avg. sentiment")) +
  geom_smooth(aes(y = IntensityWithSign_avg, color = "LOESS"), span = 0.5) +
  geom_line(aes(y = IntensityHat_M1, color = "Model 1"), size = 1) +
  geom_line(aes(y = IntensityHat_M2, color = "Model 2"), size = 1) +
  geom_line(aes(y = IntensityHat_M3, color = "Model 3"), size = 1) +
  geom_line(aes(y = IntensityHat_M4, color = "Model 4"), size = 1) +
  geom_line(aes(y = IntensityHat_M5, color = "Model 5"), size = 1) +
  labs(x = "Year", y = "Avg. sentiment")+
  theme(legend.position = "bottom")+
  guides(color = guide_legend(title = NULL))+
  scale_x_continuous(breaks = seq(1960, 2020, by = 10))

## 7.5. Model 6 ----

# theta = [sigma^2, A, Phi, T, Beta, D]

nlf <- function(theta) {
  0.5*57*log(2*pi*theta[1]) + 0.5*sum((Y - theta[2]*exp(-theta[5]*t)*cos(2*pi*t/(theta[4]+theta[6]*t) - theta[3]))^2)/theta[1]
}


theta <- c(0.7, 1, 0.9, 34, 0.016, 0)

mle <- nlm(nlf, theta, hessian=TRUE)
# mle <- optim(theta, nlf, hessian = TRUE)

mle$estimate

fitModel6 <- function(t) {mle$estimate[2]*exp(-mle$estimate[5]*t)*cos(2*pi*t/(mle$estimate[4]+mle$estimate[6]*t)-mle$estimate[3])}

fittedM6 <- data.frame(IntensityHat_M6 = fitModel6(t))

df3 <- cbind(df3, fittedM6)

# Standard Error of Regression (SER) = sqrt(SSE/degrees of freedom)

SER <- sqrt(sum((df3$IntensityWithSign_avg - df3$IntensityHat_M6)^2)/(57-6))
SSE <- sum((df3$IntensityWithSign_avg - df3$IntensityHat_M6)^2)

var_cov_matrix <- solve(mle$hessian)
std_errors <- sqrt(diag(var_cov_matrix))
print(std_errors)

df3 %>% ggplot(aes(x = Year2)) +
  geom_point(aes(y = IntensityWithSign_avg, color = "Avg. sentiment")) +
  geom_smooth(aes(y = IntensityWithSign_avg, color = "LOESS"), span = 0.5) +
  geom_line(aes(y = IntensityHat_M1, color = "Model 1"), size = 1) +
  geom_line(aes(y = IntensityHat_M2, color = "Model 2"), size = 1) +
  geom_line(aes(y = IntensityHat_M3, color = "Model 3"), size = 1) +
  geom_line(aes(y = IntensityHat_M4, color = "Model 4"), size = 1) +
  geom_line(aes(y = IntensityHat_M5, color = "Model 5"), size = 1) +
  geom_line(aes(y = IntensityHat_M6, color = "Model 6"), size = 1) +
  labs(x = "Year", y = "Avg. sentiment")+
  theme(legend.position = "bottom")+
  guides(color = guide_legend(title = NULL))+
  scale_x_continuous(breaks = seq(1960, 2020, by = 10))

# 8. Model 7: Estimating 2nd-order Auto-regression model using the LOESS estimations ----

loess_model <- loess(IntensityWithSign_avg ~ Year2, data = df3, span = 0.5)
predicted_values <- predict(loess_model, newdata = df3)
df3$IntensityHat_Loess <- predicted_values

Intensity_AR2 <- dynlm(ts(df3$IntensityHat_Loess) ~ L(ts(df3$IntensityHat_Loess)) + L(ts(df3$IntensityHat_Loess), 2))

# Intensity_AR2 <- dynlm(ts(df4$IntensityWithSign_avg) ~ L(ts(df4$IntensityWithSign_avg)) + L(ts(df4$IntensityWithSign_avg), 2))


# Compute the robust covariance matrix
robust_cov <- vcovHC(Intensity_AR2, type = "HC0")

# Perform hypothesis tests on the coefficients using the robust covariance matrix
coefficient_tests <- coeftest(Intensity_AR2, vcov = robust_cov)

# Print the results
print(coefficient_tests)

# Computing predicted sentiment using estimated AR2 model:

df3$IntensityHat_M7 <- df3$IntensityHat_Loess



for (i in 3:57) {
  df3$IntensityHat_M7[i] = Intensity_AR2$coefficients[2]*df3$IntensityHat_M7[i-1] + Intensity_AR2$coefficients[3]*df3$IntensityHat_M7[i-2] + Intensity_AR2$coefficients[1]
  # df4$IntensityHat_AR2[i] = 2*(1-lse$estimate[[2]])*df4$IntensityHat_AR2[i-1] + (2*lse$estimate[[2]] - ((2*pi/lse$estimate[[4]])^2 + lse$estimate[[2]]^2) - 1)*df4$IntensityHat_AR2[i-2]
}

# Standard Error of Regression (SER) = sqrt(SSE/degrees of freedom)

SER <- sqrt(sum((df3$IntensityWithSign_avg - df3$IntensityHat_M7)^2)/(57-3))
SSE <- sum((df3$IntensityWithSign_avg - df3$IntensityHat_M7)^2)

SSE_Loess <- sum((df3$IntensityWithSign_avg - df3$IntensityHat_Loess)^2)


cbPalette <- c(
  "Avg. sentiment"       = "#000000",
  "Model 7 (autoregression)"      = "#E69F00",
  "Model 2 (best)" = "#009E73",
  "LOESS (span = .5)" = "#56B4E9"
)

p <- df3 %>% ggplot(aes(x = Year2)) +
  geom_point(aes(y = IntensityWithSign_avg, color = "Avg. sentiment")) +
  geom_smooth(aes(y = IntensityWithSign_avg, color = "LOESS (span = .5)"), span = 0.5, size = 1) +
  geom_line(aes(y = IntensityHat_M2, color = "Model 2 (best)"), size = 1) +
  geom_line(aes(y = IntensityHat_M7, color = "Model 7 (autoregression)"), size = 1) +
  scale_color_manual(values = cbPalette) +
  labs(x = "Year", y = "Avg. sentiment")+
  theme(legend.position = "bottom")+
  guides(color = guide_legend(title = NULL))+
  scale_x_continuous(breaks = seq(1960, 2020, by = 10))

p

ggsave("Fig 10.tiff", plot = p, width = 8, height = 6, dpi = 300, units = "in", compression = "lzw")

# 9. Residual Analysis/Model Diagnosis ----

### residuals against year2 for LOESS

resi <- data.frame(resid_Loess = df3$IntensityHat_Loess - df3$IntensityWithSign_avg)

df3 <- cbind(df3, resi)

df3 %>% ggplot(aes(x = Year2)) +
  geom_point(aes(y = resid_Loess)) +
  geom_line(aes(y = 0))

### residuals against year2 for Model 2

resi <- data.frame(resid_M2 = df3$IntensityHat_M2 - df3$IntensityWithSign_avg)

df3 <- cbind(df3, resi)

df3 %>% ggplot(aes(x = Year2)) +
  geom_point(aes(y = resid_M2)) +
  geom_line(aes(y = 0))

### Residual this year against Residual next year

# Align the residuals
resi <- as.vector(resi$resid_M2)

resid1 <- data.frame(resi1 = head(resi, -1)) # Remove the last element
resid2 <- data.frame(resi2 = tail(resi, -1))  # Remove the first element

# Combine into one data frame
resi_lag <- cbind(resid1, resid2)

# Use ggplot2 to plot
ggplot(resi_lag, aes(x = resi1, y = resi2)) +
  geom_point() +
  labs(x = "Residual this year", y = "Residual next year", title = "Lag Plot of Residuals")

### density plot for residuals

resi <- data.frame(resid_M2 = df3$IntensityHat_M2 - df3$IntensityWithSign_avg)

resi %>% ggplot(aes(x = resid_M2)) +
  geom_density()

resi <- as.vector(resi$resid_M2)

qqnorm(resi)
qqline(resi)


# 10. Calculating correlation coefficient between MLE (and LOESS) estimated sentiment and actual sentiment ----

cor.test(df3$IntensityWithSign_avg, df3$IntensityHat_M2)

cor.test(df3$IntensityWithSign_avg, df3$IntensityHat_Loess)

cor.test(df3$IntensityHat_M2, df3$IntensityHat_Loess)

# 11. Nonlear Least Squares Estimation for Model 2----

# theta = [A, Phi, T, Beta]

sse <- function(theta) {
  sum((Y - theta[1]*exp(-theta[4]*t)*cos(2*pi*t/theta[3] - theta[2]))^2)
}

theta <- c(1, 0.9, 34, 0.016)

lse <- nlm(sse, theta, hessian = T)

lse$estimate

# Extract the Hessian matrix
hessian_matrix <- lse$hessian

# Calculate the variance-covariance matrix (assuming homoscedastic errors)
residual_variance <- lse$minimum / (length(Y) - length(lse$estimate))
var_cov_matrix <- solve(hessian_matrix) * residual_variance

# Calculate standard errors
std_errors <- sqrt(diag(var_cov_matrix))

print(std_errors)

# lse <- optim(theta, sse, hessian = F)
# A = lse$par[[1]]
# C = lse$par[[2]]

# 12. Computing 10-year moving avg of sentiment ----

years <- 1967:2023
values <- df3$IntensityWithSign_avg
ts_data <- ts(values, start = min(years), frequency = 1)

# Calculate the 10-year moving average with different fill options
moving_avg_na <- rollmean(ts_data, k = 10, fill = NA)

df3$IntensityHat_MA10 <- moving_avg_na

df3 %>% ggplot(aes(x = Year2)) +
  geom_point(aes(y = IntensityWithSign_avg, color = "Avg. sentiment")) +
  geom_smooth(aes(y = IntensityWithSign_avg, color = "LOESS"), span = 0.5) +
  geom_line(aes(y = IntensityHat_MA10, color = "10-year moving avg."), size = 1) +
  labs(x = "Year", y = "Avg. sentiment")+
  theme(legend.position = "bottom")+
  guides(color = guide_legend(title = NULL))+
  scale_x_continuous(breaks = seq(1960, 2020, by = 10))

# 13. Correlate sentiment score with economic variables----

tw <- read_xlsx("Taiwan misery.xlsx")
tp <- tw %>% left_join(df1, by = c("year" = "Year2"))

tp$cpi_s <- scale(tp$cpi)
tp$cpi_s <- scale(tp$cpi)
tp$gdp_s <- scale(tp$gdp)
tp$misery_s <- scale(tp$misery)
tp$sentiment <- scale(tp$IntensityWithSign_avg)

## 13.1 Correlate sentiment with GDP growth rate----

p <- tp %>% ggplot(aes(x = year)) +
  geom_point(aes(y = gdp_s), color = "red") +
  geom_smooth(aes(y = gdp_s),color = "red", span = 0.5)+
  geom_point(aes(y = sentiment), color = "blue") +
  geom_smooth(aes(y = sentiment),color = "blue", span = 0.5)+
  labs(x = "Year", y = "Standardized values")+
  guides(color = guide_legend(title = NULL))+
  scale_x_continuous(breaks = seq(1960, 2020, by = 10))

p

ggsave("Fig S1.tiff", plot = p, width = 8, height = 6, dpi = 300, units = "in", compression = "lzw")


### 13.1.1 spectral analysis----
gdp_spec <- stats::spectrum(tp$gdp_s, log = "no", plot = F)
spx <- gdp_spec$freq
spy <- gdp_spec$spec*2

sp <- data.frame(spx, spy)

p <- sp %>% ggplot(aes(x=spx, y=spy)) +
  geom_line(size = 1) +
  labs(x = "Frequency (periods/year)", y = "Spectral density")

p

ggsave("Fig S2.tiff", plot = p, width = 8, height = 6, dpi = 300, units = "in", compression = "lzw")


tp1 <- tp %>%
  select(year, gdp_s)  %>% 
  mutate(X = "X") %>%
  unite(year, X, year, sep = "") %>%
  spread(year, gdp_s)
year <- 1:ncol(tp1)

periodogram(data = tp1, time = year)



## 13.2 Correlate sentiment with Economic Misery Index----

p <- tp %>% ggplot(aes(x = year)) +
  geom_point(aes(y = misery_s), color = "red") +
  geom_smooth(aes(y = misery_s),color = "red", span = 0.5)+
  geom_point(aes(y = sentiment), color = "blue") +
  geom_smooth(aes(y = sentiment),color = "blue", span = 0.5)+
  labs(x = "Year", y = "Standardized values")+
  guides(color = guide_legend(title = NULL))+
  scale_x_continuous(breaks = seq(1960, 2020, by = 10))

p

ggsave("Fig S3.tiff", plot = p, width = 8, height = 6, dpi = 300, units = "in", compression = "lzw")

### 13.2.1 spectral analysis----

tp2 <-tp[!is.na(tp$misery_s),]
misery_spec <- stats::spectrum(tp2$misery_s, log = "no", plot = F)
spx <- misery_spec$freq
spy <- misery_spec$spec*2

sp <- data.frame(spx, spy)

p <- sp %>% ggplot(aes(x=spx, y=spy)) +
  geom_line(size = 1) +
  labs(x = "Frequency (periods/year)", y = "Spectral density")

p

ggsave("Fig S4.tiff", plot = p, width = 8, height = 6, dpi = 300, units = "in", compression = "lzw")


tp3 <- tp2 %>%
  select(year, misery_s)  %>% 
  mutate(X = "X") %>%
  unite(year, X, year, sep = "") %>%
  spread(year, misery_s)
year <- 1:ncol(tp3)

p <- periodogram(data = tp3, time = year)

p

ggsave("Fig S5.tiff", plot = p, width = 8, height = 6, dpi = 300, units = "in", compression = "lzw")


# 14. Calculate correlation coefficent between the ChatGPT-generated emo scores and the bottom-up-generated emo scores----


tp1 <- dF %>% select(id, IntensityWithSign) %>%
  rename(IntensityWithSign_GPT = IntensityWithSign)

tp2 <- DF %>% select(id, IntensityWithSign)

tp3 <- tp1 %>% left_join(tp2, by = c("id" = "id")) %>%
  mutate(Intensity_GPT = scale(IntensityWithSign_GPT),
         Intensity = scale(IntensityWithSign))

cor.test(tp3$Intensity_GPT, tp3$Intensity)

tp3 %>% sample_n(1000) %>%
  ggplot(aes(x = Intensity_GPT, y = Intensity)) +
  geom_point() +
  geom_smooth(method = 'lm')

# 15. Manual check for evaluating ChatGPT-assisted vs. Lexicon-only methods ----
tp <- read_xlsx("manual check.xlsx")
tp$gpt <- ifelse(tp$IntensityWithSign1 < 0, "N", "P")
tp$lexicon <- ifelse(tp$IntensityWithSign2 < 0, "N", "P")

tp$gpt_r <- tp$Manual_Check == tp$gpt
tp$lexicon_r <- tp$Manual_Check == tp$lexicon

tp$gpt_T_P <- tp$Manual_Check == "P" & tp$gpt == 'P'
tp$gpt_F_P <- tp$Manual_Check == "N" & tp$gpt == 'P'
tp$gpt_T_N <- tp$Manual_Check == "N" & tp$gpt == 'N'
tp$gpt_F_N <- tp$Manual_Check == "P" & tp$gpt == 'N'

tp$lexicon_T_P <- tp$Manual_Check == "P" & tp$lexicon == 'P'
tp$lexicon_F_P <- tp$Manual_Check == "N" & tp$lexicon == 'P'
tp$lexicon_T_N <- tp$Manual_Check == "N" & tp$lexicon == 'N'
tp$lexicon_F_N <- tp$Manual_Check == "P" & tp$lexicon == 'N'


sum(tp$gpt_r)
sum(tp$lexicon_r)

gptTP <- sum(tp$gpt_T_P)
gptFP <- sum(tp$gpt_F_P)
gptTN <- sum(tp$gpt_T_N)
gptFN <- sum(tp$gpt_F_N)

lexiconTP <- sum(tp$lexicon_T_P)
lexiconFP <- sum(tp$lexicon_F_P)
lexiconTN <- sum(tp$lexicon_T_N)
lexiconFN <- sum(tp$lexicon_F_N)

precision_gptP <- gptTP/(gptTP + gptFP)
precision_gptN <- gptTN/(gptTN + gptFN)
precision_gpt_avg <- (precision_gptP + precision_gptN)/2

recall_gptP <- gptTP/(gptTP + gptFN)
recall_gptN <- gptTN/(gptTN + gptFP)
recall_gpt_avg <- (recall_gptP + recall_gptN)/2

f1_gptP <- 2*(precision_gptP * recall_gptP)/(precision_gptP + recall_gptP)
f1_gptN <- 2*(precision_gptN * recall_gptN)/(precision_gptN + recall_gptN)
f1_gpt_avg <- (f1_gptP + f1_gptN)/2

precision_lexiconP <- lexiconTP/(lexiconTP + lexiconFP)
precision_lexiconN <- lexiconTN/(lexiconTN + lexiconFN)
precision_lexicon_avg <- (precision_lexiconP + precision_lexiconN)/2

recall_lexiconP <- lexiconTP/(lexiconTP + lexiconFN)
recall_lexiconN <- lexiconTN/(lexiconTN + lexiconFP)
recall_lexicon_avg <- (recall_lexiconP + recall_lexiconN)/2

f1_lexiconP <- 2*(precision_lexiconP * recall_lexiconP)/(precision_lexiconP + recall_lexiconP)
f1_lexiconN <- 2*(precision_lexiconN * recall_lexiconN)/(precision_lexiconN + recall_lexiconN)
f1_lexicon_avg <- (f1_lexiconP + f1_lexiconN)/2

precision_gptP
precision_gptN 
precision_gpt_avg

recall_gptP
recall_gptN 
recall_gpt_avg 

f1_gptP
f1_gptN 
f1_gpt_avg 

precision_lexiconP 
precision_lexiconN 
precision_lexicon_avg 

recall_lexiconP
recall_lexiconN 
recall_lexicon_avg 

f1_lexiconP 
f1_lexiconN 
f1_lexicon_avg

table("ChatGPT-assisted" = tp$gpt_r, "Lexicon-only" = tp$lexicon_r)
